# citizen forecasting 2020: a state-by-state experiment
# andreas murr & mike lewis-beck
# analysis

# ==========
# = set up =
# ==========

# clear working memory

rm(list=ls())

# write leave one out cross-validation function

loo.errors = function(x){
	e = x$residuals
	h = lm.influence(x)$hat
	sqrt(mean((e / (1 - h))^2))
}

# load packages

library(arm)

# load data

load("AnesInterestNationState.RData")
load("AnesNation.RData")
load("AnesState.RData")
load("MturkInterestNationState.RData")
load("MturkNation3.RData")
load("MturkState3.RData")

# ===========
# = table 1 =
# ===========

data = na.omit(AnesNation)
m.tab1 = lm(I(pop.dem*100) ~ I(p.dem*100), data=data)
display(m.tab1, digits=1)
loo.errors(m.tab1)

# ===========
# = table 2 =
# ===========

# popular vote

m1 = lm(pop.dem ~ p.dem, data=AnesNation[AnesNation$year < 2008,])
m2 = lm(pop.dem ~ p.dem, data=AnesNation[AnesNation$year < 2012,])
m3 = lm(pop.dem ~ p.dem, data=AnesNation[AnesNation$year < 2016,])

y.hat = rep(NA, 3)
y.hat[1] = predict(m1, AnesNation[AnesNation$year == 2008,])
y.hat[2] = predict(m2, AnesNation[AnesNation$year == 2012,])
y.hat[3] = predict(m3, AnesNation[AnesNation$year == 2016,])

y = rep(NA, 3)
y[1] = AnesNation$pop.dem[AnesNation$year == 2008]
y[2] = AnesNation$pop.dem[AnesNation$year == 2012]
y[3] = AnesNation$pop.dem[AnesNation$year == 2016]

t1 = round(cbind(y, y.hat, y-y.hat)*100,1)

# electoral vote (data)

y.hat = rep(NA, 3)
y = rep(NA, 3)
j = 1
for (i in c(2008, 2012, 2016)){
	y.hat[j] = sum(with(AnesState[AnesState$year==i,], ifelse(forecast=="t" | is.na(forecast)==TRUE, electoralv / 2, ifelse(forecast=="d", electoralv, 0))))
	y[j] = sum(with(AnesState[AnesState$year==i,], ifelse(winner=="d", electoralv, 0)))
	j = j + 1
}

table(AnesState$forecast[AnesState$year==2008], useNA="always")
table(AnesState$forecast[AnesState$year==2012], useNA="always")
table(AnesState$forecast[AnesState$year==2016], useNA="always")

with(AnesState[AnesState$year==2008,], sum(electoralv[is.na(forecast)]))

t2 = cbind(y, y.hat, y-y.hat)

# electoral vote (data)

y.hat = rep(NA, 3)
y = rep(NA, 3)
j = 1
for (i in c(2008, 2012, 2016)){
	train = AnesState[AnesState$year < i,]
	test = AnesState[AnesState$year == i,]
	m = lm(dem.two.s ~ p.dem, data=train, weights=train$n)
	test$y.hat = ifelse(predict(m, test) > .5, "d", "r")
	y.hat[j] = sum(with(test, ifelse(is.na(y.hat)==TRUE, electoralv / 2, ifelse(y.hat=="d", electoralv, 0))))
	y[j] = sum(with(test, ifelse(winner=="d", electoralv, 0)))
	j = j + 1
}

t3 = cbind(y, y.hat, y-y.hat)

# combined

cbind(t1, t2, t3)

# ===========
# = table 3 =
# ===========

data = na.omit(AnesState)
m.tab3 = lm(I(dem.two.s*100) ~ I(p.dem*100), data=data, weights=data$n)
display(m.tab3, digits=1)

J = length(unique(data$year))
rmse = rep(NA, nrow=J)
for (j in 1:J){
	data.train = data[data$year%in%unique(data$year)[-j],]
	m = lm(dem.two.s ~ p.dem, data=data.train, weights=data.train$n)
	data.fit = data[data$year==unique(data$year)[j],]
	data.fit$y.hat = predict(m, data.fit)
	rmse[j] = with(data.fit, sqrt(mean(((dem.two.s - y.hat)*100)^2)))
}
mean(rmse)

# ===========
# = table 4 =
# ===========

MturkNation3$forecast.data = ifelse(MturkNation3$p.dem>.5, "Biden", "Trump")
y.hat = predict(m.tab1, MturkNation3) / 100
forecast.model = ifelse(y.hat > .5, "Biden", "Trump")
cbind(MturkNation3[c(3,2,5)], y.hat, forecast.model)

# ===========
# = table 5 =
# ===========

MturkState3$y.hat = predict(m.tab3, MturkState3) / 100
MturkState3[c("state","p.dem","y.hat")]

# ===========
# = table 6 =
# ===========

sum(with(MturkState3, electoralv*(p.dem>.5)))
sum(with(MturkState3, electoralv*(p.dem<.5)))
sum(with(MturkState3, electoralv*(p.dem==.5)))

sum(with(MturkState3, electoralv*(y.hat>.5)))
sum(with(MturkState3, electoralv*(y.hat<.5)))
sum(with(MturkState3, electoralv*(y.hat==.5)))

# ============
# = figure 1 =
# ============

# create colouring for map, needs to be pasted in the svg file

paste(paste("#", MturkState3$state[MturkState3$forecast=="Biden"], collapse=",", sep=""), "{fill: #CCCCCC;}", sep=" ")
paste(paste("#", MturkState3$state[MturkState3$forecast=="Trump"], collapse=",", sep=""), "{fill: #525252;}", sep=" ")

# ==============
# = discussion =
# ==============

# are the state forecasts randomly?  we can reject the hypothesis at the p<.001 level

d = with(MturkState3, p.dem*n)
r = with(MturkState3, n-d)
summary(glm(cbind(d,r)~1,family=binomial))

# proportion of forecasts that are the same for state and nation

mean(with(MturkInterestNationState[MturkInterestNationState$forecast.state%in%c("d","r") & MturkInterestNationState$forecast.nation%in%c("d","r"),],  forecast.state==forecast.nation))

mean(with(AnesInterestNationState[AnesInterestNationState$forecast.state%in%c("d","r") & AnesInterestNationState$forecast.nation%in%c("d","r"),],  forecast.state==forecast.nation))

# interest

mean(MturkInterestNationState$interest, na.rm=TRUE)

# ==========================
# = supplementary material =
# ==========================

# table a1

se = with(MturkNation3, sqrt(p.dem*(1-p.dem)/n))
round((MturkNation3$p.dem + c(-1,1)*1.96*se)*100, 1)
round(predict(m.tab1, MturkNation3) + c(-1,1)*loo.errors(m.tab1), 1)

# table a2

library(binom)
tab.a1 = with(MturkState3, binom.confint(p.dem*n, n, methods="wilson"))
tab.a1$data.tossup = ifelse(tab.a1$lower < .5 & tab.a1$upper > .5, 1, 0)
tab.a1$state = MturkState3$state
tab.a1$data.lower.r = round(tab.a1$lower*100, 1)
tab.a1$data.upper.r = round(tab.a1$upper*100, 1) 

tab.a1$model.lower = MturkState3$y.hat - mean(rmse)/100
tab.a1$model.upper = MturkState3$y.hat + mean(rmse)/100
tab.a1$model.tossup = ifelse(tab.a1$model.lower < .5 & tab.a1$model.upper > .5, 1, 0)

tab.a1$model.lower.r = round(tab.a1$model.lower*100, 1)
tab.a1$model.upper.r = round(tab.a1$model.upper*100, 1)

tab.a1[c("state", "data.lower.r", "data.upper.r", "data.tossup", "model.lower.r", "model.upper.r", "model.tossup")]

# table a3

	# rmse electoral vote (data)
	sel = unique(AnesState$year)[tapply(AnesState$forecast, AnesState$year, function(x){sum(is.na(x))!=length(x)})]
	y.hat = rep(NA, length(sel))
	y = rep(NA, length(sel))
	j = 1
	for (i in sel){
		y.hat[j] = sum(with(AnesState[AnesState$year==i,], ifelse(forecast=="t" | is.na(forecast)==TRUE, electoralv / 2, ifelse(forecast=="d", electoralv, 0))))
		y[j] = sum(with(AnesState[AnesState$year==i,], ifelse(winner=="d", electoralv, 0)))
		j = j + 1
	}
	rmse.data = sqrt(mean((y-y.hat)^2))
	# rmse electoral vote (model)
	y.hat = rep(NA, length(sel)-1)
	y = rep(NA, length(sel)-1)
	j = 1
	for (i in sel[-1]){
		train = AnesState[AnesState$year < i,]
		test = AnesState[AnesState$year == i,]
		m = lm(dem.two.s ~ p.dem, data=train, weights=train$n)
		test$y.hat = ifelse(predict(m, test) > .5, "d", "r")
		y.hat[j] = sum(with(test, ifelse(is.na(y.hat)==TRUE, electoralv / 2, ifelse(y.hat=="d", electoralv, 0))))
		y[j] = sum(with(test, ifelse(winner=="d", electoralv, 0)))
		j = j + 1
	}
	rmse.model = sqrt(mean((y-y.hat)^2))
	# table
	round(sum(with(MturkState3, electoralv*(p.dem>.5)))+c(-1,1)*rmse.data)
	round(sum(with(MturkState3, electoralv*(p.dem<.5)))+c(-1,1)*rmse.data)
	round(sum(with(MturkState3, electoralv*(y.hat>.5)))+c(-1,1)*rmse.model)
	round(sum(with(MturkState3, electoralv*(y.hat<.5)))+c(-1,1)*rmse.model)	

# ===================
# = end source code =
# ===================